In [ ]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from hanspell import spell_checker #한글 맞춤법 검사기 (python 2.7) #https://github.com/ssut/py-hanspell

Project Crawling

1. CrowdFunding 사이트 크롤링

  • wadiz_df : project_id(프로젝트넘버), title, area, category, target(목표펀딩금액), result(최종펀딩금액), duration(펀딩기간), comment_all(전체 댓글 수), comment_user(지지자 댓글 수), comment_provider(개설자 댓글 수), money_supporter(현금지지), sign_supporter(서명지지)
  • project_money_all : project_id(프로젝트넘버), funding_money(개인별 펀딩금액), funding_date(펀딩날짜)

In [ ]:
# Crawl Wadiz campaign detail pages 1-99. Every page that parses adds one summary
# row to wadiz_df; the individual pledges scraped from the matching detailBacker
# page are gathered into project_money_all (project_id, funding_money, funding_date).
wadiz_df = pd.DataFrame(columns=["project_id", "title", "area", "category", "target", "result", "duration", "comment_all", "comment_user",
                                   "comment_provider", "money_supporter", "sign_supporter"])
project_money_frames = []  # one pledge frame per parsed project, concatenated once after the loop
for page in range(1, 100):
    try:
        project_id = page
        response = requests.get("http://www.wadiz.kr/web/campaign/detail/{page_num}".format(page_num=project_id))
        dom = BeautifulSoup(response.content, "html.parser")
        # The fixed slice offsets below cut label text/whitespace around each value;
        # presumably tuned to the page markup at crawl time -- TODO confirm.
        title = dom.select("div.wd-ui-title-wrap h1.wd-h1")[0].text
        area = dom.select("div.wd-ui-campaign-info li.wd-data-area")[0].text
        category = dom.select("div.wd-ui-campaign-info li.wd-data-tag")[0].text
        target = dom.select("div.wd-info-target em.wd-data-target")[0].text
        result = dom.select("div.wd-ui-target-old span.wd-data-collection")[0].find("em").text
        comment_all = dom.select_one("div.wd-ui-tab-wrap").find_all("li")[1].text[18:-3]
        # Number of reply lists on the page, used as the provider comment count.
        comment_provider = len(dom.select("ul.wd-list-reply"))
        comment_user = int(comment_all) - comment_provider
        # Not stored anywhere, but kept so a page without li.wd-last is still skipped as before.
        number_join_all = dom.select_one("li.wd-last").text[17:-6]
        number_money = dom.select("li.wd-data-money")[0].text[6:-1][:-1]
        number_supporter = dom.select("li.wd-data-sign")[0].text[6:-1][:-1]
        duration = dom.select("li.wd-data-date")[0].text[-23:]
        wadiz_df.loc[len(wadiz_df)] = [project_id, title, area, category, target, result, duration, comment_all,
                                         comment_user, comment_provider, number_money, number_supporter]

        # Backer page: individual pledge amounts and pledge dates.
        p_id = page
        response_1 = requests.get("http://www.wadiz.kr/web/campaign/detailBacker/{project_num}".format(project_num = p_id))
        dom_1 = BeautifulSoup(response_1.content, "html.parser")
        # Every second <strong> in a sponsor entry is taken as the pledged amount; commas are stripped.
        amount_tags = dom_1.select("span.wd-data-sponsor strong")[1::2]
        b = pd.Series([tag.text[97:-93].replace(",", "") for tag in amount_tags])
        # The pledge date is cut out of each sponsor <script> text and truncated to 10 characters.
        date_tags = dom_1.select("span.wd-data-sponsor script")
        date = pd.Series([tag.text[67:-53][0:10] for tag in date_tags])
        p = pd.DataFrame(columns=["project_id"])
        # fillna(p_id) fills the empty project_id column with this project's id.
        # NOTE(review): it would also overwrite a missing amount/date with the id if the
        # two scraped lists ever differ in length -- confirm they always match.
        project_money = pd.concat([p, b, date], axis=1).fillna(p_id)
        project_money = project_money.rename(columns={0 : "funding_money", 1: "funding_date"})
        project_money = project_money.loc[project_money["funding_money"] != ""]
        project_money_frames.append(project_money)
    except Exception:
        # Best-effort crawl: skip pages that are missing or do not match the expected
        # markup. Exception (not a bare except) so Ctrl-C still interrupts the crawl.
        continue
# Single concat instead of DataFrame.append in the loop (append was removed in pandas 2.0).
project_money_all = pd.concat(project_money_frames) if project_money_frames else pd.DataFrame()
project_money_all.index = np.arange(len(project_money_all))

In [ ]:
# Save the crawled data: project summaries and per-pledge records, one CSV each.
wadiz_df.to_csv('wadiz_df_0329.csv', encoding='utf-8')
project_money_all.to_csv('project_money_all_0329.csv')

2. wadiz_df Data 정리

  • 최종 금액 0원 초과만 재분류
  • 날짜처리 : '2016.01.01 - 2016.03.01' -> date_start : 2016.01.01, date_end : 2016.03.01
  • 날짜 이상치 처리
  • 펀딩 기간 추가 : date_duration = date_end - date_start
  • year, month 나누기
  • 한글명 카테고리 -> 영문명으로 변경 (python 2.7 작업 환경에 용이하기 때문)
  • LabelEncoding
  • OneHotEncoding
  • funding_rate column 생성 : result / target
  • success/fail column : success =1, fail = 0

In [ ]:
# Keep only projects whose final funded amount is greater than 0.
# "result" is still the scraped text at this point (commas are removed and the
# column is converted to int only in later cells), so comparing it with 0
# directly is not a numeric comparison. Build a numeric view for the filter and
# leave the column itself untouched for the later conversion cells.
result_amount = pd.to_numeric(
    wadiz_df["result"].astype(str).str.replace(",", ""), errors="coerce"
)
# Values that cannot be parsed become NaN and are dropped by the comparison.
wadiz_df = wadiz_df[result_amount > 0]

In [ ]:
# Date handling: split every duration string on '-' into its two parts,
# store them as date_start / date_end, then drop the raw duration column.
date = np.vstack([str(span).split('-') for span in wadiz_df.duration])
wadiz_df["date_start"], wadiz_df["date_end"] = date[:, 0], date[:, 1]
wadiz_df.drop("duration", axis=1, inplace=True)

In [ ]:
# Date outliers: discard rows whose date_start is only tabs followed by a space.
keep_row = wadiz_df['date_start'].ne('\t\t\t\t\t\t\t\t\t\t ')
wadiz_df = wadiz_df[keep_row]

In [ ]:
# Parse both date columns into datetime values.
for date_col in ("date_start", "date_end"):
    wadiz_df[date_col] = pd.to_datetime(wadiz_df[date_col])

In [ ]:
# Funding period (date_duration): time elapsed from date_start to date_end.
wadiz_df["date_duration"] = wadiz_df["date_end"].sub(wadiz_df["date_start"])

In [ ]:
# Display the first rows of wadiz_df to inspect the result so far.
wadiz_df.head()

In [ ]:
# Extract the calendar year and month of each project's start date.
for part in ('year', 'month'):
    wadiz_df[part] = wadiz_df['date_start'].map(lambda stamp, part=part: getattr(stamp, part))

In [ ]:
# Korean -> English conversion of the area and category labels.
# One lookup table per column, applied with Series.replace and assigned back to
# the frame. The previous form, wadiz_df[col][mask] = value, is chained indexing:
# pandas may perform the write on a temporary copy, so the update can be lost.
# Values without an entry in the table are left unchanged, as before.
AREA_TO_ENGLISH = {
    u'서울특별시': 'seoul',
    u'경기도': 'kyungki',
    u'부산광역시': 'busan',
    u'인천광역시': 'incheon',
    u'경상북도': 'kyungbuk',
    u'전라북도': 'jeonbuk',
    u'강원도': 'kangwon',
    u'대구광역시': 'deagu',
    u'충청남도': 'chungnam',
    u'충청북도': 'chungbuk',
    u'대전광역시': 'deajeon',
    u'광주광역시': 'gwangju',
    u'경상남도': 'kyungnam',
    u'제주특별자치도': 'jeju',
    u'울산광역시': 'ulsan',
    u'전라남도': 'jeonnam',
    u'세종특별자치시': 'sejong',
}
CATEGORY_TO_ENGLISH = {
    u"나눔/공익": 'share/public',
    u"라이프/패션": 'life/fashion',
    u"테크/디자인": 'tech/design',
    u"교육": 'education',
    u"책/영화": 'book/movie',
    u"음악/공연": 'music/concert',
    u"미술/사진/전시": 'art/photo/exhibit',
    u"환경": 'environment',
    u"스포츠": 'sports',
    u"여행": 'travel',
    u"게임/만화": 'game/comics',
    u"피규어/웹툰": 'figure/webtoon',
}
wadiz_df["area"] = wadiz_df["area"].replace(AREA_TO_ENGLISH)
wadiz_df["category"] = wadiz_df["category"].replace(CATEGORY_TO_ENGLISH)

In [ ]:
# Category outliers: label missing categories as 'etc'.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which may operate on a temporary copy and leave wadiz_df unchanged.
wadiz_df["category"] = wadiz_df["category"].fillna('etc')

In [ ]:
# Label-encode category and area into <name>_label columns.
# The same encoder object is refitted for each column, category first, then area.
le = LabelEncoder()
for source_col in ("category", "area"):
    wadiz_df[source_col + "_label"] = le.fit_transform(wadiz_df[source_col])

In [ ]:
# One-hot encode category, area, start month and start year, then append the
# indicator columns to wadiz_df.
category_dummy = pd.get_dummies(wadiz_df['category'], prefix = 'category_label')
# Fix: area indicators previously reused the 'category_label' prefix (copy-paste),
# which made area columns look like categories. They now use 'area_label'.
# NOTE(review): anything that read the old 'category_label_<area>' names must be updated.
area_dummy = pd.get_dummies(wadiz_df['area'], prefix = 'area_label')
month = pd.get_dummies(wadiz_df.month, prefix="month")
year = pd.get_dummies(wadiz_df.year, prefix="year")
wadiz_df = pd.concat([wadiz_df, category_dummy, area_dummy, year, month], axis=1)

In [ ]:
# Remove the thousands separators (commas) from the two amount columns.
for amount_col in ('result', 'target'):
    wadiz_df[amount_col] = wadiz_df[amount_col].map(lambda amount: amount.replace(",", ""))

In [ ]:
# Convert the two amount columns to int, element by element.
for amount_col in ('result', 'target'):
    wadiz_df[amount_col] = wadiz_df[amount_col].map(int)

In [ ]:
# funding_rate: final amount raised divided by the target amount.
# success: 1 where funding_rate >= 1, 0 where funding_rate < 1.
wadiz_df["funding_rate"] = wadiz_df["result"] / wadiz_df["target"]
# Start from the rate itself so rows matching neither condition (NaN rate) keep their value.
wadiz_df["success"] = wadiz_df["result"] / wadiz_df["target"]
# .loc writes into wadiz_df directly; the previous wadiz_df["success"][mask] = v form
# is chained indexing and may assign into a temporary copy.
wadiz_df.loc[wadiz_df['funding_rate'] >= 1, "success"] = 1
wadiz_df.loc[wadiz_df['funding_rate'] < 1, "success"] = 0

3. Project_money Data 처리

  • 날짜별 펀딩된 금액 중 0~5일 內 펀딩금액만 처리

In [ ]:
# Cast every project_id to int so it can serve as the merge key below.
wadiz_df['project_id'] = wadiz_df['project_id'].map(int)

In [ ]:
# Helper DataFrame for the date calculation: each pledge row joined with its
# project's columns on project_id.
date_difference = project_money_all.merge(wadiz_df, on="project_id")

In [ ]:
# Parse the pledge date (both frames) and the project start date into datetimes.
project_money_all["funding_date"] = pd.to_datetime(project_money_all["funding_date"])
for stamp_col in ("funding_date", "date_start"):
    date_difference[stamp_col] = pd.to_datetime(date_difference[stamp_col])

In [ ]:
# Display the gap between each pledge date and its project's start date.
date_difference["funding_date"] - date_difference["date_start"]

In [ ]:
# Time between each project's start date and each individual pledge.
# Look the start date up by project_id instead of assigning the merged frame's
# result by row index: date_difference comes from an inner merge, so it carries
# its own fresh row index (and loses pledges whose project is not in wadiz_df)
# and does not line up row-for-row with project_money_all.
start_by_project = (
    date_difference.drop_duplicates("project_id").set_index("project_id")["date_start"]
)
project_money_all["date_difference"] = (
    pd.to_datetime(project_money_all["funding_date"])
    - project_money_all["project_id"].map(start_by_project)
)
# Outliers: pledges with no matching project (missing difference) or dated
# before the project start. A missing value never satisfies the comparison, so
# both kinds are dropped here without filling a placeholder into the column.
project_money_all = project_money_all[project_money_all["date_difference"] >= pd.Timedelta(0)]

In [ ]:
# Express date_difference as a number of days (float).
# Uses the timedelta accessor rather than int(x) / 8.64e13 (nanoseconds per day):
# that form only works when apply() happens to pass elements int() accepts, and
# it hides the unit in a magic constant.
project_money_all["date_difference"] = (
    pd.to_timedelta(project_money_all["date_difference"]).dt.total_seconds() / 86400.0
)

In [ ]:
# Display project_money_all to inspect the computed date_difference column.
project_money_all

In [ ]:
# Show the element type of date_difference using the first remaining row.
# Positional .iloc[0] is used because rows were filtered without resetting the
# index, so the label 0 is not guaranteed to exist any more.
type(project_money_all['date_difference'].iloc[0])

In [ ]:
# Create the 0day_difference column, initially 1.0 in every row.
project_money_all["0day_difference"] = np.ones(project_money_all.shape[0])

In [ ]:
# For each window of 0..5 days, flag every pledge as "short" (made within that
# many days of the project start) or "long" (made later).
for i in np.arange(6):
    number = i
    column = "{number}day_difference".format(number = i)
    # Build the labels in a standalone Series and assign the whole column once.
    # The previous project_money_all[col][mask] = value form is chained indexing
    # and may write to a temporary copy.
    labels = pd.Series(np.ones(len(project_money_all)), index=project_money_all.index, dtype=object)
    labels[project_money_all["date_difference"] <= number] = "short"
    labels[project_money_all["date_difference"] > number] = "long"
    # Rows matching neither comparison (missing date_difference) keep the initial 1.0.
    project_money_all[column] = labels

In [ ]:
# Convert every pledge amount to int.
project_money_all['funding_money'] = project_money_all['funding_money'].map(int)

In [ ]:
# One frame per window (0..5 days) holding only the pledges flagged "short" for it.
zero_day, one_day, two_day, three_day, four_day, five_day = [
    project_money_all.loc[
        project_money_all["{number}day_difference".format(number=window)] == "short"
    ]
    for window in range(6)
]

In [ ]:
# Per project, total the pledged money and the day offsets inside each window.
# Only the two numeric columns are summed. An unrestricted .sum() also tries to
# total the pledge-date and "short"/"long" label columns; depending on the
# pandas version those are silently dropped, concatenated, or raise an error.
zero_day, one_day, two_day, three_day, four_day, five_day = [
    window_rows.groupby("project_id", as_index=False)[["funding_money", "date_difference"]].sum()
    for window_rows in (zero_day, one_day, two_day, three_day, four_day, five_day)
]

In [ ]:
# Rename funding_money to <N>day_funding_money in each window frame (N = 0..5).
zero_day, one_day, two_day, three_day, four_day, five_day = [
    window_frame.rename(columns={"funding_money": "{number}day_funding_money".format(number=window)})
    for window, window_frame in enumerate((zero_day, one_day, two_day, three_day, four_day, five_day))
]

In [ ]:
# Rename date_difference to <N>day_date in each window frame (N = 0..5).
zero_day, one_day, two_day, three_day, four_day, five_day = [
    window_frame.rename(columns={"date_difference": "{number}day_date".format(number=window)})
    for window, window_frame in enumerate((zero_day, one_day, two_day, three_day, four_day, five_day))
]

4. Project_money_all, Wadiz_df 합치기

  • 각 프로젝트별 0~5일 內 누적금액이 얼마인지 확인가능

In [ ]:
# Outer-merge the six window frames into wadiz_df on project_id, in order 0..5.
for window_frame in (zero_day, one_day, two_day, three_day, four_day, five_day):
    wadiz_df = pd.merge(wadiz_df, window_frame, on = "project_id", how='outer')

5. Data 추가


In [ ]:
# NaN check: print the number of missing values in each column.
# A NaN here means no money was pledged within that 0-5 day window.
for column in wadiz_df.columns:
    missing = wadiz_df["{column}".format(column = column)].isnull()
    print(int(missing.sum()))

In [ ]:
# Replace every remaining NaN in wadiz_df with 0.
wadiz_df = wadiz_df.fillna(0)

In [ ]:
# Re-check NaN: print the number of missing values in each column again.
for column in wadiz_df.columns:
    missing = wadiz_df["{column}".format(column = column)].isnull()
    print(int(missing.sum()))

In [ ]:
# Funding rate per window: money pledged within N days divided by the target (N = 0..5).
for window in range(6):
    money_col = "{number}day_funding_money".format(number = window)
    rate_col = "{number}day_funding_rate".format(number = window)
    wadiz_df[rate_col] = wadiz_df[money_col] / wadiz_df["target"]

In [ ]:
# For each window, print how many projects already have a funding rate of 1 or more.
for window in range(6):
    reached = wadiz_df["{number}day_funding_rate".format(number = window)] >= 1
    print(int(reached.sum()))

In [ ]:
# Natural log of each window's funding rate, stored as <N>day_log_funding_rate.
for window in range(6):
    rate_col = "{number}day_funding_rate".format(number = window)
    log_col = "{number}day_log_funding_rate".format(number = window)
    wadiz_df[log_col] = wadiz_df[rate_col].map(np.log)

In [ ]:
# Save the enriched project table to CSV.
wadiz_df.to_csv('wadiz_df_0329_1.csv', encoding='utf-8')

5. Comment Crawling

  • 각 프로젝트별 지지자, 개설자 댓글 크롤링
  • user_data_all : 지지자 댓글
  • provider_data_all : 개설자 댓글

In [ ]:
# Crawl the Q&A page of every project in wadiz_df and collect supporter comments
# (user_data_all) and provider replies (provider_data_all).
project_id = wadiz_df.project_id
user_data = pd.DataFrame(columns=['project_id', 'user_id', 'comment', 'date'])
provider_data = pd.DataFrame(columns=['project_id', 'provider_id', 'comment', 'date'])
for i in project_id[0:]:
    project_id_list = i
    response = requests.get('https://www.wadiz.kr/web/campaign/detail/qa/{project_id_list}'.format(project_id_list = i))
    dom = BeautifulSoup(response.content, 'html.parser')
    # NOTE(review): the selector ends in a bare class dot ('li.'); it looks truncated and
    # may be rejected by newer selector engines -- confirm the intended class name.
    user_all = dom.select('div.wd-ui-recommend li.')
    print(project_id_list)  # progress indicator
    for user in user_all:
        user_url = user.select_one('a.wd-data-name').get('href')
        user_comment = user.select_one('span').text
        try:
            user_date = user.select_one('span.wd-data-whenCreated').text
        except AttributeError:
            # select_one returned None: this entry has no creation date, skip it.
            continue
        user_data.loc[len(user_data)] = [project_id_list, user_url, user_comment, user_date]
    for provider in dom.select('ul.wd-list-reply'):
        provider_url = provider.select_one('a.wd-data-name').get('href')
        # The last two <span> elements of a reply are read as its text and its date.
        provider_spans = provider.select('span')
        provider_comment = provider_spans[-2].text
        provider_date = provider_spans[-1].text
        provider_data.loc[len(provider_data)] = [project_id_list, provider_url, provider_comment, provider_date]
# The *_all frames are copies of the collected rows. This replaces appending to an
# empty frame with DataFrame.append, which was removed in pandas 2.0.
user_data_all = user_data.copy()
provider_data_all = provider_data.copy()

In [ ]:
# Save the crawled supporter comments and provider replies, one CSV each.
user_data_all.to_csv('user_data_all_0329.csv', encoding='utf-8')
provider_data_all.to_csv('provider_data_all_0329.csv', encoding='utf-8')

6. 맞춤법 검사

  • #한글 맞춤법 검사기 (python 2.7, https://github.com/ssut/py-hanspell)
  • 개설자 댓글 맞춤법 검사
  • error 갯수 합산, 각 댓글별 어절 수 계산
  • provider_grammar_level : error / comment_length(어절 수)

In [ ]:
# Spell-check every provider comment. Each checked comment yields one row: the
# provider_data_all row joined with the checker's result fields, plus
# comment_length = number of entries in the checker's `words` field.
# Column names are given as a list (a set has no defined order); 'comment',
# 'date' and 'errors' are included so an empty result still has every column
# that the following cells read.
comment_columns = ['project_id', 'provider_id', 'comment', 'date', 'result', 'original',
                   'checked', 'errors', 'words', 'time', 'comment_length']
comment_rows = []  # one single-row frame per comment, concatenated once after the loop
for i in np.arange(len(provider_data_all)):
    try:
        result = spell_checker.check(provider_data_all['comment'][i])
        comment = pd.DataFrame(provider_data_all.loc[i]).T
        comment_result = pd.DataFrame([result])
        comment_result.index = comment.index
        comment_result_df = comment.join(comment_result)
        # Set the length on the row itself rather than through chained indexing on the
        # accumulated frame, which may write to a temporary copy.
        comment_result_df['comment_length'] = len(comment_result_df['words'][i])
        comment_rows.append(comment_result_df)
        if i % 100 == 0:
            print(i)  # progress indicator every 100 comments
    except Exception:
        # Best-effort: skip comments the checker cannot process.
        # Exception (not a bare except) so Ctrl-C still stops the loop.
        continue
# Single concat instead of DataFrame.append per row (append was removed in pandas 2.0).
if comment_rows:
    comment_analysis = pd.concat(comment_rows)
else:
    comment_analysis = pd.DataFrame(columns=comment_columns)

In [ ]:
# Build comment_error from four columns of comment_analysis, then convert the
# two count columns to int.
selected = [comment_analysis[name] for name in ('project_id', 'errors', 'provider_id', 'comment_length')]
comment_error = pd.DataFrame(selected).T
for count_col in ('errors', 'comment_length'):
    comment_error[count_col] = comment_error[count_col].map(int)

In [ ]:
# comment_error handling: convert the two count columns to int.
for count_col in ('errors', 'comment_length'):
    comment_error[count_col] = comment_error[count_col].map(int)

In [ ]:
# Total the per-comment values for each project_id.
per_project = comment_error.groupby('project_id', as_index=False)
comment_error = per_project.sum()

In [ ]:
# provider_grammar_level: a project's total spelling errors divided by its
# total comment length (word count).
error_total = comment_error['errors']
word_total = comment_error['comment_length']
comment_error['provider_grammar_level'] = error_total / word_total

In [ ]:
# Save the per-comment spell-check results to CSV.
comment_analysis.to_csv('comment_analysis.csv', encoding='utf-8')

7. 개설자 댓글이 존재하는 Project만 선정

  • wadiz_provider_analysis : 총 샘플 수 599개

In [ ]:
# Inner-join on project_id: keep only projects that have a row in comment_error.
wadiz_provider_analysis = wadiz_df.merge(comment_error, how='inner', on='project_id')

In [ ]:
# Save the merged project / grammar-level table to CSV.
wadiz_provider_analysis.to_csv('wadiz_provider_analysis_0329.csv', encoding='utf-8')

In [ ]:
# Display the first rows of the final wadiz_df.
wadiz_df.head()

In [ ]:
# Display the merged project / grammar-level table.
wadiz_provider_analysis

In [ ]:
# Display the per-pledge table.
project_money_all

In [ ]:
# Display the crawled provider replies.
provider_data_all

In [ ]:
# Display the crawled supporter comments.
user_data_all

In [ ]: